First, we want to set some variables depending on the samples we want to use in the PCA, etc. change these before running the analysis
# define input variables -------
#quick sample information
pop <- read.table("/dss/dsshome1/lxc0B/ra52qed/gitlab/swallow.projects/scripts/0_sample.lists/sample.names.RGT.sub.pop.txt", header=T)
#any other metadata to be included
info <- read_xlsx("/dss/dsshome1/lxc0B/ra52qed/gitlab/swallow.projects/scripts/0_sample.lists/NCBI_info_updated_20220727.xlsx")
## New names:
## • `` -> `...17`
## • `` -> `...42`
#number of samples
N = 35
#color palettes for plotting
color_sub <- c( #"#810070", #E
"#00316e", #G
"#d30000", #R
#"#9716a8", #RG
#"#FFA500", #RT
#"#be6400", #S
"#f4d000", #T
#"#622a0f", #TV
"grey") #NA/unknown
Here, we can also specify a subset of samples if there are individuals we want to exclude
#create percent methylation df
prop.df <- prop.168.20miss.merged
prop.df <- prop.df %>%
dplyr::select(contains("RU_13_R_") | contains("_G_") | contains("_T_"))
Calculate only sites that show +/- 10% diff in methylation and use those for PCA??
# Define population groups based on column substrings
population_groups <- list(
Rr= "RU_13_R_",
G = "_G_",
T = "_T_"
)
# Calculate the mean for each population group
population_means <- sapply(names(population_groups), function(pop) {
cols <- grep(population_groups[[pop]], colnames(prop.df), value = TRUE)
if (length(cols) == 0) return(NULL) # If no columns match, return NULL
means <- apply(prop.df[, cols, drop = FALSE], 1, function(x) mean(x, na.rm = TRUE))
return(means)
})
# Assign column names to the calculated means
colnames(population_means) <- names(population_groups)
population_means <- as.data.frame(population_means)
# Function to check if any pair of columns has at least a 10% difference
has_10_percent_difference <- function(row) {
combs <- combn(row, 2) # Generate all combinations of two columns
return(any(abs(combs[1, ] - combs[2, ]) >= 0.05))
}
# Add a new column to the dataframe
prop.df$difference_10_percent <- apply(population_means, 1, has_10_percent_difference)
population_means$difference_10_percent <- apply(population_means, 1, has_10_percent_difference)
#clean dataframe for sites with less than 10% difference
prop <- prop.df[prop.df$difference_10_percent == TRUE, ]
prop <- prop[,c(1:35)]
Here, we can use a function that plots the PCA so we can plot several at once
Here we want to plot the amount of variance described by each PC axis. We do this by using the eigenvalues and converting this to a percentage of variance.
We can make plots looking at the first 3 axes of the PCA.
##PCA loadings